Link for the dataset: https://drive.google.com/file/d/1VHQdYYHldT0n1gDTcwpuojg1eIWqX3MD/view?usp=sharing
import tensorflow as tf
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers
from keras.utils import plot_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
# NOTE(review): plot_precision_recall_curve was removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator is the supported replacement.
from sklearn.metrics import average_precision_score, precision_recall_curve, plot_precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.utils import resample
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Fix the RNG seeds for NumPy and TensorFlow so results are reproducible,
# and clear any Keras graph state left over from a previous run.
np.random.seed(62)
tf.random.set_seed(62)
tf.keras.backend.clear_session()
# Read the data into a dataframe
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Project/creditcard.csv")
# Convert the raw "Time" column (seconds since the first transaction) into
# hour-of-day (0-23) so it carries a daily-cycle signal instead of an index.
df["Time"] = df["Time"].apply(lambda x: x / 3600 % 24)
df.head()

# Visualize class distribution
sns.set_theme(style="darkgrid")
sns.countplot(x="Class", data=df)

# Split the normal and fraud instances; keep only 2000 normal instances
normal = df[df['Class'] == 0].sample(2000)
fraud = df[df['Class'] == 1]
print("Number of Normal:", normal.shape[0], ", Number of Fraud:", fraud.shape[0])

# Shuffle normal + fraud together, then drop the target column.
# DataFrame.append was removed in pandas 2.0 -> use pd.concat instead.
df_view = pd.concat([normal, fraud]).sample(frac=1).reset_index(drop=True)
X = df_view.drop(['Class'], axis=1).values
Y = df_view["Class"].values
X.shape
def tsne_plot(x1, y1, name="graph"):
    """Project x1 to 2-D with t-SNE and scatter-plot it coloured by label.

    x1   : 2-D feature array.
    y1   : array of 0/1 labels (0 = normal, 1 = fraud).
    name : plot title.
    """
    tsne = TSNE(n_components=2, random_state=0)
    X_t = tsne.fit_transform(x1)
    plt.figure(figsize=(12, 8))
    # linewidth must be numeric — the string '1' is rejected by newer matplotlib
    plt.scatter(X_t[np.where(y1 == 0), 0], X_t[np.where(y1 == 0), 1],
                marker='o', color='g', linewidth=1, alpha=0.8, label='Normal')
    plt.scatter(X_t[np.where(y1 == 1), 0], X_t[np.where(y1 == 1), 1],
                marker='o', color='r', linewidth=1, alpha=0.8, label='Fraud')
    plt.legend(loc='best')
    plt.title(name)
    plt.show()

tsne_plot(X, Y, "Original Without Autoencoder")
# Data preprocessing: min-max scale all features, then separate classes.
x = df.drop(["Class"], axis=1)
y = df["Class"].values
# NOTE(review): the scaler is fit on the entire dataset before the train/test
# split further below, which leaks validation statistics into training —
# consider fitting on the training portion only.
x_scale = preprocessing.MinMaxScaler().fit_transform(x.values)
x_norm, x_fraud = x_scale[y == 0], x_scale[y == 1]
x_norm[0]
def _build_autoencoder(tag, encoder_sizes, bottleneck=None, l1_penalty=None,
                       activation='tanh', optimizer='adadelta'):
    """Build and compile a symmetric dense autoencoder.

    tag           : prefix used in every layer name (e.g. "M1" -> "M1_en_ly1").
    encoder_sizes : widths of the encoder Dense layers; the decoder mirrors
                    them in reverse order, so the layer count (and therefore
                    model.layers indexing used later) matches the originals.
    bottleneck    : optional extra middle layer width ("<tag>_mid_ly").
    l1_penalty    : optional L1 activity penalty applied to the first encoder
                    layer only (as in the original models 1 and 2).
    Returns the compiled Model; also prints summary() and plot_model().
    """
    in_layer = Input(shape=(X.shape[1],), name=tag + "_input_ly")
    net = in_layer
    # encoder stack — regularizer only on the first layer, matching originals
    for i, units in enumerate(encoder_sizes, start=1):
        reg = regularizers.l1(l1_penalty) if (i == 1 and l1_penalty is not None) else None
        net = Dense(units, activation=activation, activity_regularizer=reg,
                    name="%s_en_ly%d" % (tag, i))(net)
    # optional middle bottleneck block
    if bottleneck is not None:
        net = Dense(bottleneck, activation=activation, name=tag + "_mid_ly")(net)
    # decoder stack mirrors the encoder
    for i, units in enumerate(reversed(encoder_sizes), start=1):
        net = Dense(units, activation=activation, name="%s_de_ly%d" % (tag, i))(net)
    # output layer reconstructs the input dimensionality
    out_layer = Dense(X.shape[1], activation=activation, name=tag + "_output_ly")(net)
    model = Model(in_layer, out_layer)
    model.compile(optimizer=optimizer, loss="mse")
    model.summary()
    plot_model(model, show_shapes=True)
    return model


def _train_autoencoder(model, title):
    """Train the autoencoder to reconstruct the first 10000 normal rows and
    plot the training/validation loss curves. Returns the History object."""
    history = model.fit(x_norm[0:10000], x_norm[0:10000],
                        batch_size=5, epochs=12,
                        shuffle=True, validation_split=0.20)
    pd.DataFrame(history.history).plot(figsize=(8, 5))
    plt.grid(True)
    plt.title(title)
    plt.gca().set_ylim(0, 1)
    plt.show()
    return history


# NOTE(review): 10e-5 equals 1e-4; kept as written, but confirm 1e-5 wasn't intended.
autoencoder_01 = _build_autoencoder("M1", [100, 50], l1_penalty=10e-5)
au_01_his = _train_autoencoder(autoencoder_01, "Model 1 training")

autoencoder_02 = _build_autoencoder("M2", [100, 50], bottleneck=30, l1_penalty=10e-5)
au_02_his = _train_autoencoder(autoencoder_02, "Model 2 training")

autoencoder_03 = _build_autoencoder("M3", [100, 50])
au_03_his = _train_autoencoder(autoencoder_03, "Model 3 training")

autoencoder_04 = _build_autoencoder("M4", [100, 50], bottleneck=30)
au_04_his = _train_autoencoder(autoencoder_04, "Model 4 training")

autoencoder_05 = _build_autoencoder("M5", [20, 10])
au_05_his = _train_autoencoder(autoencoder_05, "Model 5 training")

autoencoder_06 = _build_autoencoder("M6", [20, 10], bottleneck=5)
au_06_his = _train_autoencoder(autoencoder_06, "Model 6 training")

autoencoder_07 = _build_autoencoder("M7", [20, 10, 8, 4])
au_07_his = _train_autoencoder(autoencoder_07, "Model 7 training")

autoencoder_08 = _build_autoencoder("M8", [20, 10, 8, 4], bottleneck=2)
au_08_his = _train_autoencoder(autoencoder_08, "Model 8 training")
# Extract the encoder half of each trained autoencoder and visualize its
# latent representation of 3000 normal + 200 fraud rows with t-SNE.
def _encoder_tsne(autoencoder, n_keep, title, labels):
    """Wrap the first n_keep layers (Input + encoder stack) in a Sequential,
    encode the sample rows, t-SNE-plot them, and return
    (encoder, norm_representation, fraud_representation)."""
    encoder = Sequential(autoencoder.layers[0:n_keep])
    norm_rep = encoder.predict(x_norm[:3000])
    fraud_rep = encoder.predict(x_fraud[:200])
    tsne_plot(np.append(norm_rep, fraud_rep, axis=0), labels, title)
    return encoder, norm_rep, fraud_rep


# Labels for the t-SNE plots: 3000 normal (0) followed by 200 fraud (1).
# rep_y is reused by every later visualization in this script.
y_n = np.zeros(3000)
y_f = np.ones(200)
rep_y = np.append(y_n, y_f)

# Model 1 (Input + 2 encoder layers)
encoder_layer_rep_m1, norm_hid_rep_m1, fraud_hid_rep_m1 = _encoder_tsne(
    autoencoder_01, 3, "Model 1", rep_y)

# Model 2 (Input + 2 encoder layers + bottleneck)
encoder_layer_rep_m2, norm_hid_rep_m2, fraud_hid_rep_m2 = _encoder_tsne(
    autoencoder_02, 4, "Model 2", rep_y)

# Model 3
encoder_layer_rep_m3, norm_hid_rep_m3, fraud_hid_rep_m3 = _encoder_tsne(
    autoencoder_03, 3, "Model 3", rep_y)

# Model 4
encoder_layer_rep_m4, norm_hid_rep_m4, fraud_hid_rep_m4 = _encoder_tsne(
    autoencoder_04, 4, "Model 4", rep_y)

# Model 5
encoder_layer_rep_m5, norm_hid_rep_m5, fraud_hid_rep_m5 = _encoder_tsne(
    autoencoder_05, 3, "Model 5", rep_y)

# Model 6
encoder_layer_rep_m6, norm_hid_rep_m6, fraud_hid_rep_m6 = _encoder_tsne(
    autoencoder_06, 4, "Model 6", rep_y)

# Model 7 (Input + 4 encoder layers)
encoder_layer_rep_m7, norm_hid_rep_m7, fraud_hid_rep_m7 = _encoder_tsne(
    autoencoder_07, 5, "Model 7", rep_y)

# Model 8 (Input + 4 encoder layers + bottleneck)
encoder_layer_rep_m8, norm_hid_rep_m8, fraud_hid_rep_m8 = _encoder_tsne(
    autoencoder_08, 6, "Model 8", rep_y)
# Base RF Model Evaluation, then one evaluation per autoencoder representation.
def _fit_and_report(train_feats, train_labels, val_feats, val_labels, title):
    """Fit a RandomForest on the given features, print the standard metric
    suite on the validation data, draw the precision-recall curve, and
    return (classifier, predictions, display)."""
    clf = RandomForestClassifier().fit(train_feats, train_labels)
    pred = clf.predict(val_feats)
    print("")
    print("Classification Report: ")
    print(classification_report(val_labels, pred))
    print("")
    print("Confusion Matrix: \n", confusion_matrix(val_labels, pred, labels=[1, 0]))
    print("Accuracy Score: ", accuracy_score(val_labels, pred))
    print("Precision Score: ", precision_score(val_labels, pred))
    print("Recall Score: ", recall_score(val_labels, pred))
    print("F1 Score: ", f1_score(val_labels, pred))
    # PR-AUC — plot_precision_recall_curve was removed in scikit-learn 1.2
    disp = PrecisionRecallDisplay.from_estimator(clf, val_feats, val_labels)
    disp.ax_.set_title(title)
    return clf, pred, disp


# NOTE(review): test_size=0.7 keeps only 30% of rows for training — confirm intended.
train_x, val_x, train_y, val_y = train_test_split(x_scale, y, test_size=0.7, stratify=y)
clf_base, pred_y_base, disp_base = _fit_and_report(
    train_x, train_y, val_x, val_y, 'Base RF Precision-Recall curve')

# Model 1 Evaluation
train_x_transform_m1 = encoder_layer_rep_m1.predict(train_x)
val_x_transform_m1 = encoder_layer_rep_m1.predict(val_x)
clf_m1, pred_y_m1, disp_m1 = _fit_and_report(
    train_x_transform_m1, train_y, val_x_transform_m1, val_y,
    'Model 1 Precision-Recall curve')

# Model 2 Evaluation
train_x_transform_m2 = encoder_layer_rep_m2.predict(train_x)
val_x_transform_m2 = encoder_layer_rep_m2.predict(val_x)
clf_m2, pred_y_m2, disp_m2 = _fit_and_report(
    train_x_transform_m2, train_y, val_x_transform_m2, val_y,
    'Model 2 Precision-Recall curve')

# Model 3 Evaluation
train_x_transform_m3 = encoder_layer_rep_m3.predict(train_x)
val_x_transform_m3 = encoder_layer_rep_m3.predict(val_x)
clf_m3, pred_y_m3, disp_m3 = _fit_and_report(
    train_x_transform_m3, train_y, val_x_transform_m3, val_y,
    'Model 3 Precision-Recall curve')

# Model 4 Evaluation
train_x_transform_m4 = encoder_layer_rep_m4.predict(train_x)
val_x_transform_m4 = encoder_layer_rep_m4.predict(val_x)
clf_m4, pred_y_m4, disp_m4 = _fit_and_report(
    train_x_transform_m4, train_y, val_x_transform_m4, val_y,
    'Model 4 Precision-Recall curve')

# Model 5 Evaluation
train_x_transform_m5 = encoder_layer_rep_m5.predict(train_x)
val_x_transform_m5 = encoder_layer_rep_m5.predict(val_x)
clf_m5, pred_y_m5, disp_m5 = _fit_and_report(
    train_x_transform_m5, train_y, val_x_transform_m5, val_y,
    'Model 5 Precision-Recall curve')

# Model 6 Evaluation
train_x_transform_m6 = encoder_layer_rep_m6.predict(train_x)
val_x_transform_m6 = encoder_layer_rep_m6.predict(val_x)
clf_m6, pred_y_m6, disp_m6 = _fit_and_report(
    train_x_transform_m6, train_y, val_x_transform_m6, val_y,
    'Model 6 Precision-Recall curve')

# Model 7 Evaluation
train_x_transform_m7 = encoder_layer_rep_m7.predict(train_x)
val_x_transform_m7 = encoder_layer_rep_m7.predict(val_x)
clf_m7, pred_y_m7, disp_m7 = _fit_and_report(
    train_x_transform_m7, train_y, val_x_transform_m7, val_y,
    'Model 7 Precision-Recall curve')

# Model 8 Evaluation
train_x_transform_m8 = encoder_layer_rep_m8.predict(train_x)
val_x_transform_m8 = encoder_layer_rep_m8.predict(val_x)
clf_m8, pred_y_m8, disp_m8 = _fit_and_report(
    train_x_transform_m8, train_y, val_x_transform_m8, val_y,
    'Model 8 Precision-Recall curve')
# Rebuild the full normal/fraud partitions (the earlier `normal` was a 2000-row sample).
normal = df[df['Class'] == 0]
fraud = df[df['Class'] == 1]
normal.shape
fraud.shape
# Increase the number of fraud rows to half of the normal count.
# (was hard-coded as int(284315/2); derive it from the data instead)
upsample_fraud = resample(fraud,
                          replace=True,
                          n_samples=normal.shape[0] // 2)
# Join the fraud upsample with normal
resample_data = pd.concat([normal, upsample_fraud])
resample_data.Class.value_counts()
# Scale the resampled data
x_res = resample_data.drop(["Class"], axis=1)
y_res = resample_data["Class"].values
x_res_scale = preprocessing.MinMaxScaler().fit_transform(x_res.values)
# Split the resampled data
train_x_res, val_x_res, train_y_res, val_y_res = train_test_split(x_res_scale, y_res, test_size=0.7, stratify=y_res)
# Model 1 — evaluation on the resampled (upsampled-fraud) split
train_x_transform_m1_res = encoder_layer_rep_m1.predict(train_x_res)
val_x_transform_m1_res = encoder_layer_rep_m1.predict(val_x_res)
clf_m1_res = RandomForestClassifier().fit(train_x_transform_m1_res, train_y_res)
pred_y_m1_res = clf_m1_res.predict(val_x_transform_m1_res)
print("")
print("Classification Report: ")
print(classification_report(val_y_res, pred_y_m1_res))
print("")
print("Confusion Matrix: \n", confusion_matrix(val_y_res, pred_y_m1_res, labels=[1, 0]))
print("Accuracy Score: ", accuracy_score(val_y_res, pred_y_m1_res))
print("Precision Score: ", precision_score(val_y_res, pred_y_m1_res))
print("Recall Score: ", recall_score(val_y_res, pred_y_m1_res))
print("F1 Score: ", f1_score(val_y_res, pred_y_m1_res))
# PR-AUC — plot_precision_recall_curve was removed in scikit-learn 1.2
disp_m1_res = PrecisionRecallDisplay.from_estimator(clf_m1_res, val_x_transform_m1_res, val_y_res)
disp_m1_res.ax_.set_title('Model 1 with Resampling Precision-Recall curve')

# Model 5 — evaluation on the resampled split
train_x_transform_m5_res = encoder_layer_rep_m5.predict(train_x_res)
val_x_transform_m5_res = encoder_layer_rep_m5.predict(val_x_res)
clf_m5_res = RandomForestClassifier().fit(train_x_transform_m5_res, train_y_res)
pred_y_m5_res = clf_m5_res.predict(val_x_transform_m5_res)
print("")
print("Classification Report: ")
print(classification_report(val_y_res, pred_y_m5_res))
print("")
print("Confusion Matrix: \n", confusion_matrix(val_y_res, pred_y_m5_res, labels=[1, 0]))
print("Accuracy Score: ", accuracy_score(val_y_res, pred_y_m5_res))
print("Precision Score: ", precision_score(val_y_res, pred_y_m5_res))
print("Recall Score: ", recall_score(val_y_res, pred_y_m5_res))
print("F1 Score: ", f1_score(val_y_res, pred_y_m5_res))
# PR-AUC — plot_precision_recall_curve was removed in scikit-learn 1.2
disp_m5_res = PrecisionRecallDisplay.from_estimator(clf_m5_res, val_x_transform_m5_res, val_y_res)
disp_m5_res.ax_.set_title('Model 5 with Resampling Precision-Recall curve')
# Model 1 — reconstruction-based representation
# Visualization: feed samples through the FULL autoencoder and t-SNE the reconstructions.
norm_recon_rep_m1 = autoencoder_01.predict(x_norm[:3000])
fraud_recon_rep_m1 = autoencoder_01.predict(x_fraud[:200])
rep_x = np.append(norm_recon_rep_m1, fraud_recon_rep_m1, axis=0)
# rep_y is the same for all models
tsne_plot(rep_x, rep_y, "Model 1 Reconstruction Representation")
# Classification on the reconstructed features
train_x_reError_m1 = autoencoder_01.predict(train_x)
val_x_reError_m1 = autoencoder_01.predict(val_x)
clf_reError_m1 = RandomForestClassifier().fit(train_x_reError_m1, train_y)
pred_y_reError_m1 = clf_reError_m1.predict(val_x_reError_m1)
print("")
print("Classification Report: ")
print(classification_report(val_y, pred_y_reError_m1))
print("")
print("Confusion Matrix: \n", confusion_matrix(val_y, pred_y_reError_m1, labels=[1, 0]))
print("Accuracy Score: ", accuracy_score(val_y, pred_y_reError_m1))
print("Precision Score: ", precision_score(val_y, pred_y_reError_m1))
print("Recall Score: ", recall_score(val_y, pred_y_reError_m1))
print("F1 Score: ", f1_score(val_y, pred_y_reError_m1))
# PR-AUC — plot_precision_recall_curve was removed in scikit-learn 1.2
disp_reError_m1 = PrecisionRecallDisplay.from_estimator(clf_reError_m1, val_x_reError_m1, val_y)
disp_reError_m1.ax_.set_title('Model 1 Using Reconstruction Errors Precision-Recall curve')

# Model 5 — reconstruction-based representation
# Visualization
norm_recon_rep_m5 = autoencoder_05.predict(x_norm[:3000])
fraud_recon_rep_m5 = autoencoder_05.predict(x_fraud[:200])
rep_x = np.append(norm_recon_rep_m5, fraud_recon_rep_m5, axis=0)
# rep_y is the same for all models
tsne_plot(rep_x, rep_y, "Model 5 Reconstruction Representation")
# Classification on the reconstructed features
train_x_reError_m5 = autoencoder_05.predict(train_x)
val_x_reError_m5 = autoencoder_05.predict(val_x)
clf_reError_m5 = RandomForestClassifier().fit(train_x_reError_m5, train_y)
pred_y_reError_m5 = clf_reError_m5.predict(val_x_reError_m5)
print("")
print("Classification Report: ")
print(classification_report(val_y, pred_y_reError_m5))
print("")
print("Confusion Matrix: \n", confusion_matrix(val_y, pred_y_reError_m5, labels=[1, 0]))
print("Accuracy Score: ", accuracy_score(val_y, pred_y_reError_m5))
print("Precision Score: ", precision_score(val_y, pred_y_reError_m5))
print("Recall Score: ", recall_score(val_y, pred_y_reError_m5))
print("F1 Score: ", f1_score(val_y, pred_y_reError_m5))
# PR-AUC — plot_precision_recall_curve was removed in scikit-learn 1.2
disp_reError_m5 = PrecisionRecallDisplay.from_estimator(clf_reError_m5, val_x_reError_m5, val_y)
disp_reError_m5.ax_.set_title('Model 5 Using Reconstruction Errors Precision-Recall curve')
# Model building — Model 1 architecture with ReLU activations
# input layer
in_layer = Input(shape=(X.shape[1],), name="M1relu_input_ly")
# encoder layers (L1 activity penalty on the first, as in Model 1)
en_layers1 = Dense(100, activation='relu', activity_regularizer=regularizers.l1(10e-5), name="M1relu_en_ly1")(in_layer)
en_layers2 = Dense(50, activation='relu', name="M1relu_en_ly2")(en_layers1)
# decoder layers
de_layers1 = Dense(50, activation='relu', name="M1relu_de_ly1")(en_layers2)
de_layers2 = Dense(100, activation='relu', name="M1relu_de_ly2")(de_layers1)
# output layer
out_layer = Dense(X.shape[1], activation='relu', name="M1relu_output_ly")(de_layers2)
# combine encoder and decoder
autoencoder_01_relu = Model(in_layer, out_layer)
autoencoder_01_relu.compile(optimizer="adadelta", loss="mse")
autoencoder_01_relu.summary()
plot_model(autoencoder_01_relu, show_shapes=True)
au_01_relu_his = autoencoder_01_relu.fit(x_norm[0:10000], x_norm[0:10000],
                                         batch_size=5, epochs=12,
                                         shuffle=True, validation_split=0.20)
pd.DataFrame(au_01_relu_his.history).plot(figsize=(8, 5))
plt.grid(True)
plt.title("Model 1 RELU training")
plt.gca().set_ylim(0, 1)
plt.show()

# Visualization of the encoder representation
encoder_layer_rep_m1_relu = Sequential([
    autoencoder_01_relu.layers[0],
    autoencoder_01_relu.layers[1],
    autoencoder_01_relu.layers[2]
])
norm_hid_rep_m1_relu = encoder_layer_rep_m1_relu.predict(x_norm[:3000])
fraud_hid_rep_m1_relu = encoder_layer_rep_m1_relu.predict(x_fraud[:200])
rep_x = np.append(norm_hid_rep_m1_relu, fraud_hid_rep_m1_relu, axis=0)
# rep_y is the same for all models
tsne_plot(rep_x, rep_y, "Model 1 RELU")

# Evaluation
train_x_transform_m1_relu = encoder_layer_rep_m1_relu.predict(train_x)
val_x_transform_m1_relu = encoder_layer_rep_m1_relu.predict(val_x)
clf_m1_relu = RandomForestClassifier().fit(train_x_transform_m1_relu, train_y)
pred_y_m1_relu = clf_m1_relu.predict(val_x_transform_m1_relu)
print("")
print("Classification Report: ")
print(classification_report(val_y, pred_y_m1_relu))
print("")
print("Confusion Matrix: \n", confusion_matrix(val_y, pred_y_m1_relu, labels=[1, 0]))
print("Accuracy Score: ", accuracy_score(val_y, pred_y_m1_relu))
print("Precision Score: ", precision_score(val_y, pred_y_m1_relu))
print("Recall Score: ", recall_score(val_y, pred_y_m1_relu))
print("F1 Score: ", f1_score(val_y, pred_y_m1_relu))
# PR-AUC — plot_precision_recall_curve was removed in scikit-learn 1.2
disp_m1_relu = PrecisionRecallDisplay.from_estimator(clf_m1_relu, val_x_transform_m1_relu, val_y)
disp_m1_relu.ax_.set_title('Model 1 RELU Precision-Recall curve')
# Model building — Model 1 architecture with ELU activations
# input layer
in_layer = Input(shape=(X.shape[1],), name="M1elu_input_ly")
# encoder layers (L1 activity penalty on the first, as in Model 1)
en_layers1 = Dense(100, activation='elu', activity_regularizer=regularizers.l1(10e-5), name="M1elu_en_ly1")(in_layer)
en_layers2 = Dense(50, activation='elu', name="M1elu_en_ly2")(en_layers1)
# decoder layers
de_layers1 = Dense(50, activation='elu', name="M1elu_de_ly1")(en_layers2)
de_layers2 = Dense(100, activation='elu', name="M1elu_de_ly2")(de_layers1)
# output layer
out_layer = Dense(X.shape[1], activation='elu', name="M1elu_output_ly")(de_layers2)
# combine encoder and decoder
autoencoder_01_elu = Model(in_layer, out_layer)
autoencoder_01_elu.compile(optimizer="adadelta", loss="mse")
autoencoder_01_elu.summary()
plot_model(autoencoder_01_elu, show_shapes=True)
au_01_elu_his = autoencoder_01_elu.fit(x_norm[0:10000], x_norm[0:10000],
                                       batch_size=5, epochs=12,
                                       shuffle=True, validation_split=0.20)
pd.DataFrame(au_01_elu_his.history).plot(figsize=(8, 5))
plt.grid(True)
plt.title("Model 1 ELU training")
plt.gca().set_ylim(0, 1)
plt.show()

# Visualization of the encoder representation
encoder_layer_rep_m1_elu = Sequential([
    autoencoder_01_elu.layers[0],
    autoencoder_01_elu.layers[1],
    autoencoder_01_elu.layers[2]
])
norm_hid_rep_m1_elu = encoder_layer_rep_m1_elu.predict(x_norm[:3000])
fraud_hid_rep_m1_elu = encoder_layer_rep_m1_elu.predict(x_fraud[:200])
rep_x = np.append(norm_hid_rep_m1_elu, fraud_hid_rep_m1_elu, axis=0)
# rep_y is the same for all models
tsne_plot(rep_x, rep_y, "Model 1 ELU")

# Evaluation
train_x_transform_m1_elu = encoder_layer_rep_m1_elu.predict(train_x)
val_x_transform_m1_elu = encoder_layer_rep_m1_elu.predict(val_x)
clf_m1_elu = RandomForestClassifier().fit(train_x_transform_m1_elu, train_y)
pred_y_m1_elu = clf_m1_elu.predict(val_x_transform_m1_elu)
print("")
print("Classification Report: ")
print(classification_report(val_y, pred_y_m1_elu))
print("")
print("Confusion Matrix: \n", confusion_matrix(val_y, pred_y_m1_elu, labels=[1, 0]))
print("Accuracy Score: ", accuracy_score(val_y, pred_y_m1_elu))
print("Precision Score: ", precision_score(val_y, pred_y_m1_elu))
print("Recall Score: ", recall_score(val_y, pred_y_m1_elu))
print("F1 Score: ", f1_score(val_y, pred_y_m1_elu))
# PR-AUC — plot_precision_recall_curve was removed in scikit-learn 1.2
disp_m1_elu = PrecisionRecallDisplay.from_estimator(clf_m1_elu, val_x_transform_m1_elu, val_y)
disp_m1_elu.ax_.set_title('Model 1 ELU Precision-Recall curve')
Model building
# --- Model 1 (Adam / tanh): X.shape[1] -> 100 -> 50 -> 50 -> 100 -> X.shape[1] ---
in_layer = Input(shape=(X.shape[1],), name="M1ADAM_tanh_input_ly")

# Encoder: L1 activity regularization on the first layer encourages sparse
# activations. Note: 10e-5 == 1e-4.
en_layers1 = Dense(
    100,
    activation='tanh',
    activity_regularizer=regularizers.l1(10e-5),
    name="M1ADAM_tanh_en_ly1",
)(in_layer)
en_layers2 = Dense(50, activation='tanh', name="M1ADAM_tanh_en_ly2")(en_layers1)

# Decoder mirrors the encoder back out to the input dimensionality.
de_layers1 = Dense(50, activation='tanh', name="M1ADAM_tanh_de_ly1")(en_layers2)
de_layers2 = Dense(100, activation='tanh', name="M1ADAM_tanh_de_ly2")(de_layers1)
out_layer = Dense(X.shape[1], activation='tanh', name="M1ADAM_tanh_output_ly")(de_layers2)

# Tie encoder and decoder together; train with Adam on MSE reconstruction loss.
autoencoder_01_adam_tanh = Model(in_layer, out_layer)
autoencoder_01_adam_tanh.compile(optimizer="adam", loss="mse")
autoencoder_01_adam_tanh.summary()
plot_model(autoencoder_01_adam_tanh, show_shapes=True)

# Fit on the first 10k normal transactions (input == target), 20% validation.
train_slice = x_norm[:10000]
au_01_adam_tanh_his = autoencoder_01_adam_tanh.fit(
    train_slice,
    train_slice,
    batch_size=5,
    epochs=12,
    shuffle=True,
    validation_split=0.20,
)

# Plot the recorded loss curves.
ax = pd.DataFrame(au_01_adam_tanh_his.history).plot(figsize=(8, 5))
ax.grid(True)
ax.set_title("Model 1 ADAM tanh training")
ax.set_ylim(0, 1)
plt.show()
Visualization
# Re-use the trained encoder half (input layer + the two encoding layers)
# as a standalone feature extractor.
encoder_layer_rep_m1_adam_tanh = Sequential(autoencoder_01_adam_tanh.layers[:3])

# Project 3000 normal and 200 fraud samples into the learned latent space.
norm_hid_rep_m1_adam_tanh = encoder_layer_rep_m1_adam_tanh.predict(x_norm[:3000])
fraud_hid_rep_m1_adam_tanh = encoder_layer_rep_m1_adam_tanh.predict(x_fraud[:200])

# Visualize the encoded representation with t-SNE
# (rep_y is shared by all models, so it is not rebuilt here).
rep_x = np.append(norm_hid_rep_m1_adam_tanh, fraud_hid_rep_m1_adam_tanh, axis=0)
tsne_plot(rep_x, rep_y, "Model 1 ADAM tanh")
Evaluation
# Evaluate the Model 1 ADAM tanh encoder: encode train/validation features,
# then fit a random-forest classifier on the latent representation and score
# it on the held-out validation split.
train_x_transform_m1_adam_tanh = encoder_layer_rep_m1_adam_tanh.predict(train_x)
val_x_transform_m1_adam_tanh = encoder_layer_rep_m1_adam_tanh.predict(val_x)
clf_m1_adam_tanh = RandomForestClassifier().fit(train_x_transform_m1_adam_tanh, train_y)
pred_y_m1_adam_tanh = clf_m1_adam_tanh.predict(val_x_transform_m1_adam_tanh)
print("")
print("Classification Report: ")
print(classification_report(val_y, pred_y_m1_adam_tanh))
print("")
# labels=[1, 0] puts the fraud class (1) first in the confusion matrix.
print("Confusion Matrix: \n", confusion_matrix(val_y, pred_y_m1_adam_tanh, labels=[1, 0]))
print("Accuracy Score: ", accuracy_score(val_y, pred_y_m1_adam_tanh))
print("Precision Score: ", precision_score(val_y, pred_y_m1_adam_tanh))
print("Recall Score: ", recall_score(val_y, pred_y_m1_adam_tanh))
print("F1 Score: ", f1_score(val_y, pred_y_m1_adam_tanh))
# PR-AUC curve. plot_precision_recall_curve was deprecated in scikit-learn 1.0
# and removed in 1.2; PrecisionRecallDisplay.from_estimator is the supported
# replacement and returns an equivalent display object (with .ax_).
from sklearn.metrics import PrecisionRecallDisplay
disp_m1_adam_tanh = PrecisionRecallDisplay.from_estimator(clf_m1_adam_tanh, val_x_transform_m1_adam_tanh, val_y)
disp_m1_adam_tanh.ax_.set_title('Model 1 ADAM tanh Precision-Recall curve')
Model building
# --- Model 1 (Adam / ReLU): X.shape[1] -> 100 -> 50 -> 50 -> 100 -> X.shape[1] ---
in_layer = Input(shape=(X.shape[1],), name="M1ADAM_relu_input_ly")

# Encoder: L1 activity regularization on the first layer encourages sparse
# activations. Note: 10e-5 == 1e-4.
en_layers1 = Dense(
    100,
    activation='relu',
    activity_regularizer=regularizers.l1(10e-5),
    name="M1ADAM_relu_en_ly1",
)(in_layer)
en_layers2 = Dense(50, activation='relu', name="M1ADAM_relu_en_ly2")(en_layers1)

# Decoder mirrors the encoder back out to the input dimensionality.
de_layers1 = Dense(50, activation='relu', name="M1ADAM_relu_de_ly1")(en_layers2)
de_layers2 = Dense(100, activation='relu', name="M1ADAM_relu_de_ly2")(de_layers1)
out_layer = Dense(X.shape[1], activation='relu', name="M1ADAM_relu_output_ly")(de_layers2)

# Tie encoder and decoder together; train with Adam on MSE reconstruction loss.
autoencoder_01_adam_relu = Model(in_layer, out_layer)
autoencoder_01_adam_relu.compile(optimizer="adam", loss="mse")
autoencoder_01_adam_relu.summary()
plot_model(autoencoder_01_adam_relu, show_shapes=True)

# Fit on the first 10k normal transactions (input == target), 20% validation.
train_slice = x_norm[:10000]
au_01_adam_relu_his = autoencoder_01_adam_relu.fit(
    train_slice,
    train_slice,
    batch_size=5,
    epochs=12,
    shuffle=True,
    validation_split=0.20,
)

# Plot the recorded loss curves.
ax = pd.DataFrame(au_01_adam_relu_his.history).plot(figsize=(8, 5))
ax.grid(True)
ax.set_title("Model 1 ADAM RELU training")
ax.set_ylim(0, 1)
plt.show()
Visualization
# Re-use the trained encoder half (input layer + the two encoding layers)
# as a standalone feature extractor.
encoder_layer_rep_m1_adam_relu = Sequential(autoencoder_01_adam_relu.layers[:3])

# Project 3000 normal and 200 fraud samples into the learned latent space.
norm_hid_rep_m1_adam_relu = encoder_layer_rep_m1_adam_relu.predict(x_norm[:3000])
fraud_hid_rep_m1_adam_relu = encoder_layer_rep_m1_adam_relu.predict(x_fraud[:200])

# Visualize the encoded representation with t-SNE
# (rep_y is shared by all models, so it is not rebuilt here).
rep_x = np.append(norm_hid_rep_m1_adam_relu, fraud_hid_rep_m1_adam_relu, axis=0)
tsne_plot(rep_x, rep_y, "Model 1 ADAM RELU")
Evaluation
# Evaluate the Model 1 ADAM RELU encoder: encode train/validation features,
# then fit a random-forest classifier on the latent representation and score
# it on the held-out validation split.
train_x_transform_m1_adam_relu = encoder_layer_rep_m1_adam_relu.predict(train_x)
val_x_transform_m1_adam_relu = encoder_layer_rep_m1_adam_relu.predict(val_x)
clf_m1_adam_relu = RandomForestClassifier().fit(train_x_transform_m1_adam_relu, train_y)
pred_y_m1_adam_relu = clf_m1_adam_relu.predict(val_x_transform_m1_adam_relu)
print("")
print("Classification Report: ")
print(classification_report(val_y, pred_y_m1_adam_relu))
print("")
# labels=[1, 0] puts the fraud class (1) first in the confusion matrix.
print("Confusion Matrix: \n", confusion_matrix(val_y, pred_y_m1_adam_relu, labels=[1, 0]))
print("Accuracy Score: ", accuracy_score(val_y, pred_y_m1_adam_relu))
print("Precision Score: ", precision_score(val_y, pred_y_m1_adam_relu))
print("Recall Score: ", recall_score(val_y, pred_y_m1_adam_relu))
print("F1 Score: ", f1_score(val_y, pred_y_m1_adam_relu))
# PR-AUC curve. plot_precision_recall_curve was deprecated in scikit-learn 1.0
# and removed in 1.2; PrecisionRecallDisplay.from_estimator is the supported
# replacement and returns an equivalent display object (with .ax_).
from sklearn.metrics import PrecisionRecallDisplay
disp_m1_adam_relu = PrecisionRecallDisplay.from_estimator(clf_m1_adam_relu, val_x_transform_m1_adam_relu, val_y)
disp_m1_adam_relu.ax_.set_title('Model 1 ADAM RELU Precision-Recall curve')
Model building
# --- Model 1 (Adam / ELU): X.shape[1] -> 100 -> 50 -> 50 -> 100 -> X.shape[1] ---
in_layer = Input(shape=(X.shape[1],), name="M1ADAM_elu_input_ly")

# Encoder: L1 activity regularization on the first layer encourages sparse
# activations. Note: 10e-5 == 1e-4.
en_layers1 = Dense(
    100,
    activation='elu',
    activity_regularizer=regularizers.l1(10e-5),
    name="M1ADAM_elu_en_ly1",
)(in_layer)
en_layers2 = Dense(50, activation='elu', name="M1ADAM_elu_en_ly2")(en_layers1)

# Decoder mirrors the encoder back out to the input dimensionality.
de_layers1 = Dense(50, activation='elu', name="M1ADAM_elu_de_ly1")(en_layers2)
de_layers2 = Dense(100, activation='elu', name="M1ADAM_elu_de_ly2")(de_layers1)
out_layer = Dense(X.shape[1], activation='elu', name="M1ADAM_elu_output_ly")(de_layers2)

# Tie encoder and decoder together; train with Adam on MSE reconstruction loss.
autoencoder_01_adam_elu = Model(in_layer, out_layer)
autoencoder_01_adam_elu.compile(optimizer="adam", loss="mse")
autoencoder_01_adam_elu.summary()
plot_model(autoencoder_01_adam_elu, show_shapes=True)

# Fit on the first 10k normal transactions (input == target), 20% validation.
train_slice = x_norm[:10000]
au_01_adam_elu_his = autoencoder_01_adam_elu.fit(
    train_slice,
    train_slice,
    batch_size=5,
    epochs=12,
    shuffle=True,
    validation_split=0.20,
)

# Plot the recorded loss curves.
ax = pd.DataFrame(au_01_adam_elu_his.history).plot(figsize=(8, 5))
ax.grid(True)
ax.set_title("Model 1 ADAM ELU training")
ax.set_ylim(0, 1)
plt.show()
Visualization
# Re-use the trained encoder half (input layer + the two encoding layers)
# as a standalone feature extractor.
encoder_layer_rep_m1_adam_elu = Sequential(autoencoder_01_adam_elu.layers[:3])

# Project 3000 normal and 200 fraud samples into the learned latent space.
norm_hid_rep_m1_adam_elu = encoder_layer_rep_m1_adam_elu.predict(x_norm[:3000])
fraud_hid_rep_m1_adam_elu = encoder_layer_rep_m1_adam_elu.predict(x_fraud[:200])

# Visualize the encoded representation with t-SNE
# (rep_y is shared by all models, so it is not rebuilt here).
rep_x = np.append(norm_hid_rep_m1_adam_elu, fraud_hid_rep_m1_adam_elu, axis=0)
tsne_plot(rep_x, rep_y, "Model 1 ADAM ELU")
Evaluation